"""Selenium 爬取拉勾网信息"""

import json
import time
from urllib.parse import quote, urlencode

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def lagou_search_url(keyword, city, district=None):
    """Build the Lagou job-search URL for a keyword, city and optional district.

    The keyword is placed in the URL path and the city/district in the query
    string. Every component is percent-encoded so that reserved characters
    (``#``, ``+``, ``&``, spaces, ...) and non-ASCII text cannot change the
    structure of the URL.

    Args:
        keyword: Search keyword; becomes the ``list_<keyword>`` path segment.
        city: Value of the ``city`` query parameter.
        district: Optional value of the ``district`` query parameter. A falsy
            value (``None`` or ``''``) omits the parameter.

    Returns:
        The search URL as a string.
    """
    params = {'px': 'default', 'city': city}
    if district:
        params['district'] = district
    # safe='' so that even '/' inside the keyword stays within one path segment.
    return ('https://www.lagou.com/jobs/list_' + quote(keyword, safe='')
            + '?' + urlencode(params))


# Search parameters and output configuration.
_keyword = '深度学习'
_city = '上海'
_id = 0  # running record counter; each new value becomes a key in ``res``
_outfile_name = 'lagou.json'
res = {}  # record id -> dict of scraped fields

# Browser options for headless mode
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)

# Everything that uses the browser runs inside try/finally so the Chrome
# process is always shut down, even when a page lookup raises.
try:
    print('[INFO] 浏览器已开启 ...')

    # City + keyword search results page
    driver.get(lagou_search_url(_keyword, _city))
    districts_elem = driver.find_elements_by_xpath(
        '//div[@data-type="district"]/a[not(@class)]')
    # Read the link texts now: the loop below navigates to other pages, after
    # which these element handles can no longer be relied on.
    districts = [elem.text for elem in districts_elem]
    print('[INFO] 城市+关键词搜索页面打开完成 ...')
    print()

    for dist in districts:
        print(f'[INFO] 当前地区：{dist}')
        cur_search_url = lagou_search_url(_keyword, _city, dist)
        driver.get(cur_search_url)
        max_page = int(
            driver.find_element_by_css_selector('.span.totalNum').text)

        # Empty search result
        if max_page == 0:
            continue

        cur_page = 1

        # Walk through every result page of this district
        while True:
            print(f'[INFO] 当前页面：{cur_page}/{max_page}')

            names_elem = driver.find_elements_by_css_selector(
                '.company_name > a')
            industry_elem = driver.find_elements_by_css_selector(
                'div.industry')
            jobs_elem = driver.find_elements_by_xpath(
                '//h3[@style="max-width: 180px;"]')
            money_elem = driver.find_elements_by_css_selector('.money')

            names = [x.text for x in names_elem]
            industry = [x.text for x in industry_elem]
            jobs = [x.text for x in jobs_elem]
            money = [x.text for x in money_elem]

            # The four lists are paired by position; zip() stops at the
            # shortest one, so surplus entries in longer lists are dropped.
            for n, i, j, m in zip(names, industry, jobs, money):
                _id += 1
                res[_id] = {
                    '公司名': n,
                    '公司描述': i,
                    '职位': j,
                    '薪资': m,
                }

            if cur_page >= max_page:
                break

            # Trigger the "next page" control through the page's own ``$``
            # function, then pause for a fixed 5 seconds (presumably to let
            # the next page of results load -- confirm).
            driver.execute_script('$(".pager_next").click()')
            cur_page += 1
            time.sleep(5)
finally:
    driver.quit()

# Write all collected records as UTF-8 JSON; the with-block guarantees the
# file is flushed and closed.
with open(_outfile_name, 'w', encoding='utf-8') as outfile:
    json.dump(res, outfile, ensure_ascii=False, indent=4)
